In [1]:
from sklearn import tree
In [2]:
# Toy training set: each row is [weight, texture] for one piece of fruit
# (presumably texture 1 = smooth, 0 = bumpy — TODO confirm against the source lesson).
features = [[140, 1], [130, 1], [150, 0], [170, 0]]
# One class label per row above (presumably 0 = apple, 1 = orange — TODO confirm).
labels = [0, 0, 1, 1]
In [4]:
# Build and train a decision-tree classifier on the toy fruit data.
# fit() returns the estimator itself, so construction and training can be chained.
clf = tree.DecisionTreeClassifier().fit(features, labels)
In [7]:
print(clf.predict([[120, 0]]))
In [ ]:
In [15]:
from sklearn.datasets import load_iris
import numpy as np
# Load the classic Iris data set and peek at its structure.
iris = load_iris()
print(iris.feature_names)  # names of the four measured attributes
print(iris.target_names)   # names of the three classes
print(iris.data[0])        # feature vector of the first sample
print(iris.target[0])      # label of the first sample (index into target_names)
In [23]:
# Indices of three rows to hold out as a tiny test set
# (presumably one per species at 0/50/100 — TODO confirm iris.target ordering).
test_idx = [0, 50, 100]
# training data: every row except the held-out ones
train_target = np.delete(iris.target, test_idx)
train_data = np.delete(iris.data, test_idx, axis=0)
print(train_target.shape)
print(train_data.shape)
# testing data: only the held-out rows
test_target = iris.target[test_idx]
test_data = iris.data[test_idx]
print(test_target.shape)
print(test_data.shape)
In [24]:
# Retrain a fresh decision tree on the reduced training set.
# The bare fit() call is the cell's last expression, so the fitted
# estimator is what the cell displays (Out[24]).
clf = tree.DecisionTreeClassifier()
clf.fit(train_data, train_target)
Out[24]:
In [25]:
# Compare the true labels of the held-out rows with the tree's predictions.
print(test_target)
print(clf.predict(test_data))
In [32]:
# viz code: export the fitted tree as Graphviz DOT text and render it to a PDF.
# NOTE: `from sklearn.externals.six import StringIO` was removed in
# scikit-learn 0.23; the standard library's io.StringIO is a drop-in
# replacement for this in-memory text buffer.
from io import StringIO
import pydotplus
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data,
                     feature_names=iris.feature_names,
                     class_names=iris.target_names,
                     filled=True, rounded=True,
                     impurity=False)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf('iris.pdf')
Out[32]:
In [36]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
greyhounds = 500
labs = 500
grey_height = 28 + 4 * np.random.randn(greyhounds)
lab_height = 24 + 4 * np.random.randn(labs)
plt.hist([grey_height, lab_height], stacked=True, color=['r', 'b'])
plt.show()
A height of 35 is almost certainly a greyhound,
and around 20 it is most likely a lab.
But around 25 it is hard to tell which breed it is — so this feature is useful, but not sufficient on its own.
In [40]:
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data    # input: features
y = iris.target  # output: labels

# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
# random_state pins the split so the accuracy printed below is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=42)

from sklearn.neighbors import KNeighborsClassifier
my_classifier = KNeighborsClassifier()
my_classifier.fit(X_train, y_train)
predictions = my_classifier.predict(X_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predictions))
In [71]:
from scipy.spatial import distance
def euc(a, b):
    """Return the Euclidean (L2) distance between points ``a`` and ``b``."""
    return distance.euclidean(a, b)

class ScrappyKNN():
    """A bare-bones 1-nearest-neighbour classifier with a scikit-learn-like API."""

    def fit(self, X_train, y_train):
        """Memorize the training set — 1-NN has no real training step."""
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Return a list with the nearest neighbour's label for each test row."""
        return [self.closest(row) for row in X_test]

    def closest(self, row):
        """Return the label of the training point nearest to ``row``.

        Ties are broken by the first (lowest-index) minimum, matching a
        strict-less-than scan over the training set.
        """
        dists = [euc(row, point) for point in self.X_train]
        best_index = dists.index(min(dists))
        return self.y_train[best_index]
In [72]:
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data    # input: features
y = iris.target  # output: labels

# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
# random_state pins the split so the accuracy printed below is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=42)

# Evaluate the hand-rolled 1-NN classifier on the same split.
my_classifier = ScrappyKNN()
my_classifier.fit(X_train, y_train)
predictions = my_classifier.predict(X_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predictions))
In [ ]: